# The sentiment function takes a really long time so I created a new data file so you don't have to run it
us_tweets <- read_csv("us_tweets.csv") 
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   tweet_id = col_double(),
##   date = col_date(format = ""),
##   hour = col_time(format = ""),
##   user_name = col_character(),
##   nickname = col_character(),
##   bio = col_character(),
##   tweet_content = col_character(),
##   latitude = col_double(),
##   longitude = col_double(),
##   country = col_character(),
##   place_as_appears_on_bio = col_character(),
##   profile_picture = col_character(),
##   tweet_url = col_character()
## )
## See spec(...) for full column specifications.

# Clean the tweet text in one pass: the inner gsub() drops every
# non-alphabetic character (keeping spaces); the outer gsub() then
# collapses any remaining 1-2 letter word, with its surrounding
# spaces, to a single space.
us_tweets$tweet_content_stripped <-
  gsub(" *\\b[[:alpha:]]{1,2}\\b *", " ",
       gsub("[^[:alpha:] ]", "", us_tweets$tweet_content))
# Total each NRC sentiment score across all tweets.
# Select the sentiment columns by NAME rather than by position: the raw
# csv already gained an auto-filled 'X1' index column on read, so
# hard-coded positions like c(20:27) silently break if the column
# layout shifts.
sentiment_cols <- c("anger", "anticipation", "disgust", "fear",
                    "joy", "sadness", "surprise", "trust")
sentimentTotals <- data.frame(count = colSums(us_tweets[, sentiment_cols]))

# Promote the row names to an explicit 'sentiment' column (in first
# position) so the data frame is ready for plotting.
sentimentTotals <- cbind("sentiment" = rownames(sentimentTotals),
                         sentimentTotals)

sentimentTotals
##                 sentiment count
## anger               anger 13605
## anticipation anticipation 52960
## disgust           disgust 12668
## fear                 fear 19942
## joy                   joy 46690
## sadness           sadness 21882
## surprise         surprise 22067
## trust               trust 76347
# Reshape to long format: one row per tweet x sentiment, with the
# sentiment name kept as a factor whose level order matches the
# original column order (factor_key = TRUE).
# NOTE(review): gather() is superseded by tidyr::pivot_longer(), but
# pivot_longer() stacks rows in a different order, so it is not a
# drop-in replacement here.
us_tweets_long <- gather(us_tweets, sentiment, count, anger:trust, 
                         factor_key = TRUE)
# Convert the time-of-day column to POSIXct so ggplot's datetime scale
# can format it. The identical conversion is repeated near the bottom
# of the script; it only needs to happen once per session.
# NOTE(review): 'hour' was parsed as col_time (not character) -- confirm
# as.POSIXct with format = " %H:%M" behaves as intended on that class.
us_tweets$hour <- as.POSIXct(us_tweets$hour, format = " %H:%M")

# Tweets per hour of day. geom_bar() counts by default; the original
# geom_histogram(stat = "count") triggered an "Ignoring unknown
# parameters" warning because geom_histogram does not take that stat.
# The y axis shows counts, so label it "Number of tweets" (the previous
# "Proportion of tweets" label contradicted both the stat and the title).
ggplot(data = us_tweets, aes(x = hour)) +
  geom_bar() +
  xlab("Time") + ylab("Number of tweets") +
  ggtitle("Number of Tweets per Hour") +
  scale_x_datetime(labels = date_format("%H:%M"))

# nchar() is already vectorized -- no sapply() wrapper needed.
us_tweets$charsintweet <- nchar(us_tweets$tweet_content)

# Distribution of tweet lengths, shaded by bin count.
# NOTE(review): xlim(0, 150) discards tweets longer than 150 characters
# (hence the "Removed ... non-finite values" warning below) -- confirm
# that cap is intended now that tweets can run to 280 characters.
ggplot(data = us_tweets, aes(x = charsintweet)) +
  geom_histogram(aes(fill = ..count..), binwidth = 8) +
  theme(legend.position = "none") +
  xlab("Characters per Tweet") + 
  ylab("Number of tweets") + 
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4") + 
  xlim(0,150) + 
  ggtitle("Characters per Tweet")
## Warning: Removed 6 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).

# Bar chart of the total score for each NRC sentiment category.
# geom_col() is the identity-stat bar geom; fill is mapped in the
# global aes and the legend suppressed since the axis already names
# each sentiment.
ggplot(sentimentTotals, aes(x = sentiment, y = count, fill = sentiment)) +
  geom_col() +
  theme(legend.position = "none") +
  labs(x = "Sentiment",
       y = "Total Count",
       title = "Total Sentiment Score for All Tweets in Sample")

# Tokenize the cleaned tweet text into one word per row.
tweet_words <- us_tweets %>% 
  unnest_tokens(word, tweet_content_stripped)

data(stop_words)

# Drop common stop words. Spelling out the join key makes the join
# explicit and silences the 'Joining, by = "word"' message.
tweet_words <- anti_join(tweet_words, stop_words, by = "word")

# Word cloud of the 200 most frequent words.
# brewer.pal() requires n >= 3 -- asking for 2 warned and returned 3
# colors anyway, so request 3 explicitly.
tweet_words %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 200, 
                 random.order = FALSE, 
                 rot.per = 0.35,  
                 colors = brewer.pal(3, "Dark2")))

# NOTE(review): pal2 is not used anywhere below -- remove if truly unused.
pal2 <- brewer.pal(8,"Dark2")

# Ten most frequent words, as a horizontal bar chart.
# Name the ranking column in top_n() explicitly instead of relying on
# the implicit "Selecting by n" default.
tweet_words %>% 
  count(word, sort = TRUE) %>% 
  top_n(10, n) %>% 
  mutate(word = fct_reorder(word, n)) %>% 
  ggplot(aes(x = word, y = n)) + 
  geom_bar(stat = "identity", fill = "blue", alpha = .6) + 
  coord_flip()

# Pull every #hashtag out of the tweet text, normalise it (strip
# punctuation, lower-case), and tabulate frequencies.
hashtags <- unlist(str_extract_all(us_tweets$tweet_content, "#\\S+"))
hashtags <- tolower(gsub("[^[:alnum:] ]", "", hashtags))

hashtag_counts <- table(hashtags)
hashtag.df <- data.frame(hashtags = names(hashtag_counts),
                         Freq = as.numeric(hashtag_counts))
hashtag.df <- arrange(hashtag.df, desc(Freq))

# Top 20 hashtags by frequency.
print(hashtag.df[1:20,])
##           hashtags  Freq
## 1              job 51511
## 2           hiring 45428
## 3             jobs 21910
## 4        careerarc 20717
## 5           retail  7454
## 6      hospitality  7311
## 7          nursing  5091
## 8       healthcare  4702
## 9         veterans  4471
## 10           sales  3310
## 11              it  2179
## 12 customerservice  1927
## 13  transportation  1568
## 14           sonic  1520
## 15   manufacturing  1476
## 16           photo  1432
## 17    businessmgmt  1348
## 18      accounting  1053
## 19     engineering   970
## 20         traffic   955
# Interactive map of US tweet locations, coloured by follower count.
# The hover text is built directly inside plot_ly(), so no throwaway
# text_label column needs to be added first.
us_tweets %>%
  filter(country == "US") %>% 
  plot_ly(x = ~longitude, y = ~latitude, type = "scatter", mode = "markers",
          alpha = 0.5, color = ~followers,
          text = ~str_c("followers: ", followers,
                        "\nlocation: ", place_as_appears_on_bio))

# Same map, coloured by each tweet's positive-sentiment score.
us_tweets %>%
  filter(country == "US") %>% 
  plot_ly(x = ~longitude, y = ~latitude, type = "scatter", mode = "markers",
          alpha = 0.5, color = ~positive, colors = "Set2",
          text = ~str_c("sentiment: ", positive,
                        "\nlocation: ", place_as_appears_on_bio))
# TODO: map the name of each tweet's dominant sentiment. The
# commented-out attempt below mixes per-tweet rows from us_tweets with
# the 8-row sentimentTotals summary, so the vector lengths cannot line up.

# filter(us_tweets$country == "US") %>% 
#   mutate(text_label = str_c("sentiment: ", sentimentTotals$sentiment, '\nlocation: ', us_tweets$place_as_appears_on_bio)) %>% 
#   plot_ly(x = ~us_tweets$longitude, y = ~us_tweets$latitude, type = "scatter", mode = "markers",
#           alpha = 0.5, 
#           color = ~sentimentTotals$sentiment, colors = "Set2", text = ~text_label)

# TODO: work out how to map sentiments interactively (e.g. with a slider)
# TODO: work out how to map tweets per hour interactively

# The same hour conversion already runs earlier in the script; guard it
# so this chunk is idempotent and safe to run on its own or after the
# earlier conversion.
if (!inherits(us_tweets$hour, "POSIXct")) {
  us_tweets$hour <- as.POSIXct(us_tweets$hour, format = " %H:%M")
}

# US tweet locations coloured by hour of day.
us_tweets %>%
  filter(country == "US") %>% 
  plot_ly(x = ~longitude, y = ~latitude, type = "scatter", mode = "markers",
          alpha = 0.5, 
          color = ~hour)